package com.spider.service.processor;

import com.spider.model.MoveDO;
import com.spider.util.CommonUtil;
import com.xiaoleilu.hutool.util.ReUtil;
import com.xiaoleilu.hutool.util.StrUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.Date;
import java.util.List;

/**
 * WebMagic {@link PageProcessor} for the Sunshine Movie site (http://www.ygdy8.com).
 *
 * <p>On every page it queues the links found in the listing table that match one of the
 * two detail-page URL patterns. When the current page itself is a detail page, it extracts
 * the movie attributes from the {@code #Zoom} block and publishes them as a {@link MoveDO}
 * under the result field {@code "bean"}; detail pages without a download link are skipped.
 *
 * @author Administrator
 * @date 2018-01-02
 */
public class Ygdy8PageProcessor implements PageProcessor {

    /** Crawl settings: GB2312 charset, 3 retries, sleep time 100 (see WebMagic {@code Site} for units). */
    private final Site site = Site.me().setCharset("GB2312").setRetryTimes(3).setSleepTime(100);

    /** Detail-page URL pattern on the s.dydytt.net host. */
    private static final String URL_POST = "(http://s\\.dydytt\\.net/html/gndy/(\\w+)/(\\d+)/(\\d+)\\.html)";

    /** Detail-page URL pattern on the www.ygdy8.com host (used for the "latest movies" links). */
    private static final String URL_POST2 = "(http://www\\.ygdy8\\.com/html/(\\w+)/(\\w+)/(\\d+)/(\\d+)\\.html)";

    /** XPath of the listing table whose links are followed. */
    private static final String LINK_TABLE_XPATH = "//div[@class=\"co_area2\"]/div[@class=\"co_content8\"]/ul/table";

    /** XPath of the block that holds the movie details on a detail page. */
    private static final String DETAIL_XPATH = "//div[@id=\"Zoom\"]/span";

    /** XPath of the movie title text on a detail page. */
    private static final String TITLE_XPATH = "//div[@class=\"title_all\"]/h1/font/text()";

    /** Matches the attribute list, from the "year" label up to the last {@code <strong>} on the same line. */
    private static final String DETAIL_REGEX = "(年　　代).*(<strong>)";

    /** Separator between two attributes inside the matched attribute list. */
    private static final String FIELD_SEPARATOR = "<br>◎";

    // Attribute labels as they appear on the site. Each label contains two
    // ideographic spaces (U+3000) between the two characters, not ASCII spaces.
    private static final String LABEL_TIME = "年　　代";
    private static final String LABEL_PLACE = "产　　地";
    private static final String LABEL_CATEGORY = "类　　别";
    private static final String LABEL_DIRECTOR = "导　　演";
    private static final String LABEL_STAR = "主　　演";
    private static final String LABEL_DESCRIPTION = "简　　介";

    /**
     * Queues follow-up detail links found on the page and, if the page is itself a detail
     * page, extracts the movie and stores it in the page's result items.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        // Evaluate the listing-table links once and filter them with both URL patterns.
        Selectable tableLinks = page.getHtml().xpath(LINK_TABLE_XPATH).links();
        List<String> pathList = tableLinks.regex(URL_POST).all();
        // Latest movies
        List<String> topList = tableLinks.regex(URL_POST2).all();

        page.addTargetRequests(pathList);
        page.addTargetRequests(topList);

        if (!isDetailPage(page)) {
            return;
        }

        MoveDO move = buildMove(page, page.getHtml().xpath(DETAIL_XPATH));

        // A movie without a download link is of no use downstream: skip the page.
        if (StrUtil.isBlank(move.getDownPath())) {
            page.setSkip(true);
        } else {
            page.putField("bean", move);
        }
    }

    /**
     * Returns the crawl settings used for this site.
     *
     * @return the {@link Site} configuration of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /** Tells whether the URL of {@code page} matches one of the two detail-page patterns. */
    private static boolean isDetailPage(Page page) {
        return page.getUrl().regex(URL_POST).match() || page.getUrl().regex(URL_POST2).match();
    }

    /**
     * Builds the movie bean from a detail page.
     *
     * <p>NOTE(review): {@code CommonUtil.isBlank} is a project helper not visible here; its
     * result is used directly as a {@code String} (e.g. with {@code split}), so it is assumed
     * to map {@code null} to a non-null value — confirm against its implementation.
     *
     * @param page    the detail page (used for the title)
     * @param content the selected {@code #Zoom} block holding attributes, image and download link
     * @return a populated {@link MoveDO}; attributes missing from the page stay empty strings
     */
    private static MoveDO buildMove(Page page, Selectable content) {
        // Data obtained from the remote page, split into one segment per attribute.
        String detailHtml = CommonUtil.isBlank(content.toString());
        String[] segments = CommonUtil.isBlank(ReUtil.get(DETAIL_REGEX, detailHtml, 0)).split(FIELD_SEPARATOR);

        String time = "";
        String place = "";
        String category = "";
        String director = "";
        String star = "";
        String description = "";

        // If a label occurs in several segments, the last occurrence wins.
        for (String segment : segments) {
            if (segment.startsWith(LABEL_TIME)) {
                time = CommonUtil.isBlank(withoutLabel(segment, LABEL_TIME));
            } else if (segment.startsWith(LABEL_PLACE)) {
                place = CommonUtil.isBlank(withoutLabel(segment, LABEL_PLACE));
            } else if (segment.startsWith(LABEL_CATEGORY)) {
                category = CommonUtil.isBlank(withoutLabel(segment, LABEL_CATEGORY));
            } else if (segment.startsWith(LABEL_DIRECTOR)) {
                director = CommonUtil.isBlank(withoutLabel(segment, LABEL_DIRECTOR));
            } else if (segment.startsWith(LABEL_STAR)) {
                star = CommonUtil.isBlank(withoutLabel(segment, LABEL_STAR));
            } else if (segment.startsWith(LABEL_DESCRIPTION)) {
                // The regex match ends with "<strong>", which therefore trails the description.
                description = CommonUtil.isBlank(withoutLabel(segment, LABEL_DESCRIPTION).replace("<strong>", ""));
            }
        }

        String title = CommonUtil.isBlank(page.getHtml().xpath(TITLE_XPATH).toString());
        String topImgUrl = CommonUtil.isBlank(content.css("img", "src").toString());
        String downPath = CommonUtil.isBlank(content.css("a", "href").toString());

        MoveDO move = new MoveDO();
        move.setTitle(title);
        move.setTime(time);
        move.setTopImgUrl(topImgUrl);
        move.setPlace(place);
        move.setCategory(category);
        move.setDirector(director);
        move.setStar(star);
        move.setDescription(description);
        move.setCreateDate(new Date());
        move.setCount(0);
        move.setStatus(1);
        move.setDownPath(downPath);
        return move;
    }

    /**
     * Removes the leading {@code label} from {@code segment}. Only the prefix is dropped, so
     * label text that happens to reappear inside the value is preserved.
     *
     * @param segment an attribute segment known to start with {@code label}
     * @param label   the label to strip
     * @return the remainder of the segment after the label
     */
    private static String withoutLabel(String segment, String label) {
        return segment.substring(label.length());
    }

}
